#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
###########################
### Author: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.5 - 14-10-2015 @ 23:53
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
######## export PYTHON_EGG_CACHE=/tmp
import pprint
import os
import nltk
# import rocksdb  # shared library cannot currently be loaded
import MySQLdb # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
from random import randint
import codecs
import sys
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
# import smrzr # https://github.com/lekhakpadmanabh/Summarizer
import re
from transliterate import translit, get_available_language_codes
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
###python -m nltk.downloader -d /usr/share/nltk_data all
####python -m nltk.downloader all
###########nltk.download()
# nltk.download("punkt")
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround so implicit str/unicode conversions default to UTF-8
class SphinxitConfig(BaseSearchConfig):
    DEBUG = False
    WITH_META = False
    WITH_STATUS = False
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }
# delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm
# mysql = MySQLdb.connect("localhost","root","###########99","onetipp" ) # last working
sphinx = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=9977)  # SphinxQL
cursorSphinx = sphinx.cursor()
mysql = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=3306)  # MySQL
cursorMysql = mysql.cursor()
def deumlaut(s):
    """
    Replace German umlauts and sharp s with their ASCII transcriptions.
    """
    s = s.replace('\xdf', 'ss')   # ß
    s = s.replace('\xfc', 'ue')   # ü
    s = s.replace('\xdc', 'Ue')   # Ü
    s = s.replace('\xf6', 'oe')   # ö
    s = s.replace('\xd6', 'Oe')   # Ö
    s = s.replace('\xe4', 'ae')   # ä
    s = s.replace('\xc4', 'Ae')   # Ä
    return s
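# Illustrative example (not part of the original script):
#   deumlaut(u'Stra\xdfe \xfcben') -> u'Strasse ueben'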
def summarizeText(s):
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers
    sentences = nltk.sent_tokenize(s)
    sentenceCount = len(sentences)
    # keep a random number of sentences; clamp the lower bound so it never goes below 1
    randSentenceCount = randint(max(1, sentenceCount - 5), sentenceCount)
    # randCount = random.randint(iround(float((sentenceCount / 100) * 55)), iround(sentenceCount))
    parser = PlaintextParser.from_string(s, Tokenizer("german"))
    stemmer = Stemmer("german")
    # summarizer = TextRankSummarizer(stemmer)
    summarizer = Summarizer(stemmer)
    summary = summarizer(parser.document, randSentenceCount)
    returnText = ""
    for sentence in summary:
        returnText += str(sentence)
        returnText += " "
    return returnText
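# Example usage (illustrative; assumes the German NLTK "punkt" data is installed):
#   summarizeText(u"Erster Satz. Zweiter Satz. Dritter Satz.")
#   returns a randomly shortened LSA summary of the input as a single string.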
# TODO:
# - create a German stopword list
# - if a stopword is part of a synonym, give that synonym a heavy penalty
def SynRanker(s, t):
    """Score the synonym candidate s as a replacement for the original word t."""
    startVal = float(1.0)
    lenSyn = len(s)
    synHasDigits = any(i.isdigit() for i in s)
    synhasSonder = False
    delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
    re_sonder = r"(\?|\.|\,|\;|\:|\!|\d)"
    re_space = r"(\t|\r|\n|\s|\w)"
    firstS = s[0:1]
    firstT = t[0:1]

    # reject identical candidates and empty strings outright
    if s == t:
        return -1
    if lenSyn <= 0:
        return -10

    # length-based scoring
    if lenSyn >= 3 and lenSyn < 14:
        startVal += 0
    elif lenSyn < 3:
        startVal -= 0.65

    # whitespace / multi-word scoring
    if (' ' in s) and lenSyn >= 14:
        startVal -= 0.75
    elif (' ' in s) and lenSyn < 14:
        startVal -= 0.55
    elif (' ' not in s) and lenSyn >= 14:
        startVal -= 0.05
    elif (' ' not in s) and lenSyn < 14:
        startVal += 0.05
    elif re.search(re_space, s) is not None:
        startVal -= 0.68

    # penalise punctuation and digits
    if re.search(re_sonder, s) is not None:
        startVal -= 0.12
        synhasSonder = True

    # capitalisation of the candidate should match the original word
    if firstS.isupper() and firstT.isupper():
        startVal += 0.15
    elif firstS.islower() and firstT.islower():
        startVal += 0.15
    elif firstS.isupper() and not firstT.isupper():
        startVal -= 0.45
    elif firstS.islower() and not firstT.islower():
        startVal -= 0.45

    # Debug output (disabled):
    # print("Synonym: ", s)
    # print("Length: ", lenSyn)
    # print("Digits: ", synHasDigits)
    # print("Space: ", (' ' in s))
    # print("Special characters: ", synhasSonder)
    # print("SynRank: ", startVal)
    # print("---------------------------------------------------")

    # later: return result codes
    return float(startVal)
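# Rough worked example (illustrative values): SynRanker(u'Haus', u'Gebaeude')
# starts at 1.0, gets +0.05 (single word shorter than 14 characters) and +0.15
# (both words capitalised), giving roughly 1.2; SynRanker(u'Haus', u'Haus')
# returns -1 because the candidate equals the original word.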
def SynDictCalculator(s, t):
    # Note: currently unused. The original version called SynRanker() with a single
    # argument, which would raise a TypeError; it is assumed here that the original
    # word t should be passed through to SynRanker, mirroring the inline loop below.
    synDict = {}
    for cSyn in s:
        synDict[cSyn] = SynRanker(cSyn, t)
    return synDict
def iround(x):
    """iround(number) -> integer
    Round a number to the nearest integer."""
    return int(round(x) - .5) + (x > 0)
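# Illustrative behaviour: iround(2.4) -> 2, iround(2.5) -> 3, iround(-2.5) -> -3
# (halves are rounded away from zero).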
inputfile = sys.argv[1]
outputfile = sys.argv[2]
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# read the input file into a string and decode it from ISO-8859-1
text = open(inputfile, 'r').read()
text = text.decode("ISO-8859-1")
# sent_tokenize_list = sent_tokenize(text)
# Summarize the text first and then work on it
tSumy = summarizeText(text)
tokens = nltk.word_tokenize(tSumy)
tokensRaw = nltk.word_tokenize(tSumy)
count = -1
changeEveryWord = 9  # Leistungsschutzrecht (German ancillary copyright): 7 characters may be reused verbatim, longer runs must be changed
changeEveryWordFlag = 0
changeEveryWordTemp = 0  # temporary up-counter
for word in tokens:
    count += 1
    # look the token up in the name table (parameterised query instead of string interpolation)
    # cursorMysql.execute("SELECT * FROM namen_table WHERE name LIKE '%s%%' LIMIT 1;" % (word))
    cursorMysql.execute("SELECT * FROM namen_table WHERE BINARY `name` = %s LIMIT 1;", (word,))
    name_content = cursorMysql.fetchone()
    # print(name_content)
    # a name was found -> do not swap in a synonym
    if name_content is not None:
        # print("Token: ", tokens)
        # print("Count: ", count)
        # print("Token count overall: ", len(tokens))
        tokens[count] = deumlaut(word)
        tokensRaw[count] = deumlaut(word)
        # name recognised and left unchanged
        continue

    # reset the "change every n-th word" window once it is full
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0

    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    if len(word) >= 2 and changeEveryWordFlag == 0:
        lstcWord = word[0:1]
        # 1. check the name DB -> if found: write-protect this entry
        # 2. check Synonym_Unique -> if found: take synonym rand[0-4] -> 4 if more than 4 synonyms
        # search_query = Search(indexes=['onetipp_name'], config=SphinxitConfig)
        # search_query = search_query.match(word).options(
        #     ranker='proximity_bm25',
        #     max_matches=1,
        #     max_query_time=350,
        #     field_weights={'name': 100, 'gender': -10000, 'language': -10000, 'meaning': -10000},
        # )
        # sphinx_result = search_query.ask()

        search_query_syn = Search(indexes=['onetipp_syn_simple'], config=SphinxitConfig)
        search_query_syn = search_query_syn.match(word).options(
            ranker='proximity_bm25',
            max_matches=1,
            max_query_time=350,
            field_weights={'synonyms': 100},
        )
        sphinx_result_syn = search_query_syn.ask()
        synID = 0
        try:
            synID = sphinx_result_syn['result']['items'][0].values()[0]
            if synID > 0:
                # print "SynDB has been found: ", synID
                # later: find more synonyms via Sphinx and parse them all
                cursorMysql.execute("SELECT synonyms FROM synonym_unique_simple WHERE uid = %s", (synID,))
                syn_content = cursorMysql.fetchone()
                synContent = list(syn_content)
                synContent = synContent[0].decode("utf-8", "ignore")
                if syn_content:
                    synwords = synContent.split(";")
                    # rank every candidate synonym against the original word
                    # http://www.saltycrane.com/blog/2007/09/how-to-sort-python-dictionary-by-keys/
                    synDict = {}
                    for wSyn in synwords:
                        synDict[wSyn] = SynRanker(wSyn, word)
                    # sort the candidates by rank, best first
                    sortedSynList = sorted(synDict.items(), key=lambda x: x[1], reverse=True)
                    firstBestSynHit = sortedSynList[0][0]
                    firstBestSynHitRank = str(sortedSynList[0][1])
                    # print(sortedSynList)
                    # later: choose randomly among the candidates sharing the highest rating
                    tokens[count] = deumlaut(firstBestSynHit)
                    tokensRaw[count] = deumlaut(firstBestSynHit)
                    changeEveryWordFlag = 1
                    changeEveryWordTemp += 1
        except IndexError:
            # no Sphinx match for this token -> leave it unchanged
            pass
# write the output file
outputtext = ' '.join(tokens)
outputtextRussia = ' '.join(tokensRaw)
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)
    f.write("\n")
    f.write("RUSSIAN TRANSLITERATION: EXAMPLE VERSION")
    f.write("\n")
    f.write(translit(outputtextRussia, 'ru'))
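# Note on the transliteration step above: with the 'ru' language pack,
# transliterate.translit() maps Latin text to Cyrillic, e.g. (illustrative)
# translit(u"Lorem ipsum", 'ru') -> u"Лорем ипсум".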
mysql.close()
exit(0)